# ================================================================================================ #
# Introduction to Exponential Random Graph modeling (ERGM) 
# Benjamin Rosche
# SDL Workshop - April 2023
# ================================================================================================ #

library(dplyr)
library(statnet) # contains many packages (e.g., network, ergm, ...)
library(xergm) # remotes::install_github("https://github.com/leifeld/xergm")
# library(igraph)     # more mature package to work with network data 
# library(intergraph) # to move between network and igraph

# NOTE: `rm(list = ls())` was removed on purpose. It only deletes user-created objects (attached
# packages, options, and the working directory are untouched), so it does not give a clean
# session, and it silently wipes the workspace of anyone who sources this file. Restart R instead.

# Fix the RNG seed so that the simulated data below are reproducible
set.seed(1)

# ================================================================================================ #
# Helper functions
# ================================================================================================ #

# Inverse logit (logistic) function: maps log-odds `x` to probabilities. Vectorized over `x`.
# Written as 1 / (1 + exp(-x)) rather than exp(x) / (1 + exp(x)): the latter evaluates to
# Inf / Inf = NaN once exp(x) overflows (x above roughly 709), whereas this form returns 1 for
# very large x and 0 for very negative x.
invlogit <- function(x) {
  1 / (1 + exp(-x))
}

# SAS: optionally subset the columns of a node-level data frame, then append `suffix` to every
# remaining column name (used below to build "_from" / "_to" copies of the node attributes).
#   nodedat: data frame of node attributes
#   vars:    character vector of columns to keep; NULL (default) keeps all columns
#   suffix:  string appended to each returned column name
# Returns the data frame with the selected, renamed columns.
SAS <- function(nodedat, vars=NULL, suffix) {
  add_suffix <- function(nm) paste0(nm, suffix)
  # Without a variable list every column is kept; otherwise narrow to `vars` first
  kept <- if (is.null(vars)) nodedat else dplyr::select(nodedat, !!vars)
  dplyr::rename_with(kept, add_suffix, dplyr::everything())
}

# ================================================================================================ #
# Create data
# ================================================================================================ #

# To estimate an ergm, we must create a network() object. This object can be created either from an
# adjacency matrix or from an edgelist. Here, I create the network object via an edgelist because it
# is more common to receive data in this format and because it demonstrates an important issue with
# edgelists: edges in the edgelist are assumed present and edges not in the edgelist are assumed absent.

n_nodes <- 50

# Start from every ordered pair of node ids, drop self-pairs, and draw one Bernoulli(0.1)
# tie indicator per remaining dyad (n_nodes * (n_nodes - 1) draws in total)
edgelist <- expand.grid(id_from = 1:n_nodes, id_to = 1:n_nodes)
edgelist <- filter(edgelist, id_from != id_to)
edgelist <- mutate(edgelist, edge = rbinom(n(), 1, 0.1)) # edges have a 10% probability
edgelist <- arrange(edgelist, id_from, id_to)

head(edgelist)

# Network object of friendships
# Only rows with edge == 1 are passed on (!): every row of an edgelist is read as a present tie
# and every dyad that is not listed as an absent tie.
# `vertices` fixes the node set to ids 1..n_nodes. Without it the nodes are inferred from the
# edgelist alone, so a node that happens to have no ties would be missing from the network and
# node-level data could no longer be matched to vertices by position.
g.friendship <- 
  edgelist %>% 
  filter(edge==1) %>% # (!)
  network(
    .,
    vertices = data.frame(id = seq_len(n_nodes)),
    matrix.type = "edgelist", directed = TRUE, multiple = FALSE
  )

g.friendship
plot(g.friendship)

# The network object starts node and edge ids always at 1. You have to renumber your ids if they 
# do not start at 1. If your ids, for instance, start at 100000, the network object will assume
# that nodes 1-99999 exist, which will slow down processing.

# Node features
# `length.out` makes the alternating 0/1 sex indicator exactly n_nodes long, also for an odd
# n_nodes (rep(c(0,1), n_nodes/2) truncates the fractional count, and data.frame() then fails
# because the two columns have different lengths).
dat.nodeattrs <- data.frame(id=1:n_nodes, sex=rep(c(0,1), length.out=n_nodes))

dat.nodeattrs %>% head()

# Attach sex as a vertex attribute; the values are matched to vertices by position.
# Note that no assignment is used here: the script relies on set.vertex.attribute() updating
# g.friendship itself (the attribute is visible in the print-out below).
set.vertex.attribute(g.friendship, "sex", dat.nodeattrs %>% pull(sex))

g.friendship

# Edge features
# Attach the sender's and the receiver's sex to every dyad, then derive the dyad-level
# indicator samesex (1 if both nodes have the same sex, 0 otherwise)
dat.edgeattrs <- left_join(edgelist, SAS(dat.nodeattrs, c("id", "sex"), "_from"), by = "id_from")
dat.edgeattrs <- left_join(dat.edgeattrs, SAS(dat.nodeattrs, c("id", "sex"), "_to"), by = "id_to")
dat.edgeattrs <- mutate(dat.edgeattrs, samesex = as.numeric(sex_from == sex_to))
dat.edgeattrs <- arrange(dat.edgeattrs, id_from, id_to)
dat.edgeattrs <- select(dat.edgeattrs, id_from, id_to, samesex)

head(dat.edgeattrs) # the data.frame must start with [id_from], [id_to]

# Note that we do not add the edge attribute "samesex" to the friendship network as we did with the 
# sex attribute. Instead, we treat it as a different network:

# Network object of sex relations
# Keeping only rows with id_from < id_to leaves one row per unordered pair of nodes, which is
# what an undirected network needs. TRUE/FALSE are spelled out because `T` and `F` are ordinary
# variables that can be reassigned.
g.samesex <-
  dat.edgeattrs %>% 
  filter(id_from<id_to) %>% # from directed to undirected ties
  network(., directed = FALSE, multiple = FALSE, matrix.type="edgelist") # note that I specified directed = FALSE

# Any column after [id_from], [id_to] is added to the network object as an edge attribute, even
# though it does not show up when the object is printed
g.samesex %e% "samesex"
  
# ================================================================================================ #
# 1 Our first ERGM ----
# ================================================================================================ #

# Edges-only model: its single coefficient is the log-odds of a tie
summary(ergm(g.friendship ~ edges))

# Back-transforming the estimated edges coefficient gives the tie probability ...
invlogit(-2.16146)

# ... which equals the share of present edges in the simulated edgelist
count(edgelist, edge)
253/(253+2197)

# ================================================================================================ #
# 2 Searching ERGM terms ----
# ================================================================================================ #

help("ergm-terms") # list of all ERGM terms
search.ergmTerms(search = "transitive") # you can search for terms

# ================================================================================================ #
# 3 Covariate effects ----
# ================================================================================================ #

## (i) Nodal covariates: nodeofactor, nodeifactor, nodefactor, nodeocov, nodeicov, nodecov ----

summary(ergm(g.friendship ~ edges + nodeofactor("sex", levels=-1)))

## (ii) Dyadic covariates: edgecov, dyadcov, nodematch, absdiff, nodemix ----

# Let's make samesex edges more likely: simulate a new network with a positive samesex coefficient
g.friendship2 <- simulate_formula(
  g.friendship ~ edges + edgecov(g.samesex, "samesex"),
  coef = c(-1, 1),
  seed = 1
)

# edgecov and nodematch give you the same estimate
ergm(g.friendship2 ~ edges + nodematch("sex"))
ergm(g.friendship2 ~ edges + edgecov(g.samesex, "samesex"))

# Excursus: Why it is not a good idea to add edge covariates to the network you are analyzing:
set.edge.value(g.friendship2, "samesex", as.matrix.network(g.samesex, attrname = "samesex"))
g.friendship2

ergm(g.friendship2 ~ edges + edgecov(g.friendship2, "samesex")) # effect estimate is Inf

# This is because by adding an edge covariate to your network, you only give it to the existing
# friendships. However, the effect of samesex should be calculated by comparing friends to
# nonfriends.
length(g.friendship2 %e% "samesex") # g.friendship2 has 929 friendships
length(g.samesex %e% "samesex") # g.samesex, by contrast, features all possible edges (50 2) = 1225

# ================================================================================================ #
# 4 Endogenous network mechanisms ----
# ================================================================================================ #

# Important endogenous mechanisms:
# - Reciprocity: mutual
# - Triadic closure: gwesp, gwdsp, ...
# - Popularity effect: gwidegree
# - Sociality effect:  gwodegree
# ...

# Let's add triadic closure
# The coefficients are given in the order of the formula terms: edges, samesex, gwesp.
# gwesp(0, fixed = TRUE) holds the decay parameter at 0 instead of estimating it; TRUE is spelled
# out because `T` is an ordinary variable that can be reassigned.
g.friendship3 <- simulate_formula(
  g.friendship ~ edges + edgecov(g.samesex, "samesex") + gwesp(0, fixed = TRUE),
  coef = c(-3.5, 1, 1),
  seed = 1
)

ergm(g.friendship3 ~ edges + edgecov(g.samesex, "samesex") + gwesp(0, fixed = TRUE)) %>% summary()

# On the interpretation of the "curved" network statistics, such as gwesp:
# The gw* terms generalize some endogenous network mechanisms that are highly correlated (e.g.,
# triangle and mutual). It is recommended to include gw* terms instead of other variants of these 
# terms because they lead to better model fit. They are somewhat difficult to interpret. 

# Interpretation of gwesp:
# - gwesp (Geometrically Weighted Edgewise Shared Partnerships) models triadic closure
# - two tied nodes i and j have an edgewise shared partner k if both are also connected to k: i-j, i-k, j-k
# - gwesp includes a decay parameter that downweights the increase in the probability of a tie for each additional edgewise shared partner
# - lower values of the decay parameter are easier to fit

# ================================================================================================ #
# 5 Interpreting effects (similar to marginal effects) ----
# ================================================================================================ #

# Model used for the remaining sections; the MCMC seed is fixed so that the fit is reproducible.
# TRUE is spelled out in gwesp() because `T` is an ordinary variable that can be reassigned.
m1 <- ergm(g.friendship3 ~ edges + nodematch("sex") + gwesp(0, fixed = TRUE), control=control.ergm(seed=1))

# Edge probabilities: interpret()
interpret(m1, type = "tie", i = 1, j = 2) # dyad-level perspective
interpret(m1, type = "tie", i = 1, j = 3) 

# Look up the samesex value of the dyads queried above
dat.edgeattrs %>% head()

interpret(object = m1, type = "node", i = 1, j = 2:3) # node-level perspective

# Edge probabilities by value of network statistics: edgeprob()
alldyads <- edgeprob(m1)

alldyads %>% select(i, j, nodematch.sex, tie, probability) %>% head()

# Mean predicted tie probability for different-sex (0) versus same-sex (1) dyads
alldyads %>% 
  select(nodematch.sex, probability) %>% 
  group_by(nodematch.sex) %>% 
  summarise(p_mean=mean(probability))
# (0.229-0.0892)*100 = a difference of 14 percentage points

# ================================================================================================ #
# 6 Goodness-of-fit and MCMC convergence ----
# ================================================================================================ #

# Diagnostics for the MCMC run behind the fitted model m1
mcmc.diagnostics(m1) # Check whether MCMC algorithm has converged

# Goodness-of-fit assessment of m1; the result is stored so that it can be both printed and plotted
m1.gof <- gof(m1) # Check model-fit

# Print the goodness-of-fit results, then plot them
m1.gof 
m1.gof %>% plot()

# ================================================================================================ #
# 7 Sample constraints ----
# ================================================================================================ #

# In network data, there are often constraints induced by the sampling design. Here, assume that
# only the five best friends of each respondent were collected:

# Keep at most five outgoing ties per sender (the first five rows of each sender in the edgelist)
# and rebuild a network object from the result. The dplyr steps return a grouped tibble, whereas
# the models below need a network on the left-hand side of the formula, so the conversion back is
# made explicit. `vertices` carries over all nodes of g.friendship3 together with their sex, so
# the new network keeps the same size as g.samesex even if a node ends up without any ties.
g.friendship4 <-
  g.friendship3 %>% 
  as.edgelist(., output = "tibble") %>% 
  rename(from=1, to=2) %>% 
  group_by(from) %>% 
  slice(1:5) %>% 
  ungroup() %>% 
  mutate(from = as.integer(from), to = as.integer(to)) %>% # same type as the vertex ids below
  as.data.frame() %>% 
  network(
    .,
    vertices = data.frame(
      id  = seq_len(network.size(g.friendship3)),
      sex = g.friendship3 %v% "sex"
    ),
    directed = TRUE, multiple = FALSE
  )

ergm(g.friendship4 ~ edges + edgecov(g.samesex, "samesex") + gwesp(0, fixed = TRUE)) %>% summary() # the estimate of triadic closure is biased
ergm(g.friendship4 ~ edges + edgecov(g.samesex, "samesex") + gwesp(0, fixed = TRUE), constraints = ~ bd(maxout=5)) %>% summary() # you can let the model retain only networks with 5 friends

# Important constraints:
# - ~bd(minin, maxin, minout, maxout) -> bounds on the degree distribution
# - ~edges -> constrains the number of edges to be the same
# - ~blockdiag("gid") -> constrains friendships to occur within specific blocks of nodes (e.g., classrooms or schools)

# ================================================================================================ #
# 8 Descriptive statistics ----
# ================================================================================================ #

# summary() on a formula counts the listed network statistics in the network instead of fitting a model
summary(g.friendship ~ edges) 
# TRUE is spelled out because `T` is an ordinary variable that can be reassigned
summary(g.friendship ~ edges + mutual + triangle + nodemix("sex", levels2 = TRUE))

# This is an important feature of the statnet package! Rather than using the terms to estimate the 
# effect of network statistics in generating the observed network, we can use the terms to count
# how often they occur in the observed network. This also gives you an idea of how the effect
# estimates are based on changes in these statistics.

# ================================================================================================ #
# 9 Examining network structure using ERGMs ----
# ================================================================================================ #

# For many social science research questions, the presence of a tie is not of ultimate interest but
# rather the specific structures in a network. A question, for instance, could be how segregated
# networks are on the basis of sex. This can be examined with ERGMs by simulating networks from the
# model. Rather than analyzing the resulting networks ourselves, we can choose to just monitor the
# network statistics in which we are interested. By default, the simulate() function monitors all
# stats that are in the model formula. However, we can also monitor features that we have not
# modeled (here: mutual).

# Simulate one network from m1 with the number of edges held fixed; return only the statistics
# (the model terms plus the additionally monitored `mutual`)
simulate(m1, constraints = ~edges, monitor = ~mutual, nsim = 1, output = "stats", seed = 1)

# edges nodematch.sex gwesp.fixed.0 mutual
#   390           260           342     35

# 260/390*100 = 67% of all edges in the network generated by m1 are same-sex ties.

# We can now compare this to a null model of interest. For instance, a random network with the same
# number of edges (TRUE is spelled out because `T` is an ordinary variable that can be reassigned):
simulate_formula(
  g.friendship3 ~ edges,
  coef = 0,
  constraints = ~edges,
  monitor = ~nodematch("sex") + gwesp(0, fixed = TRUE) + mutual,
  nsim = 1, output = "stats", seed = 1
)

# edges nodematch.sex gwesp.fixed.0 mutual
#   390           192           277     36

# 192/390*100 = 49% of all edges in a comparable random network are same-sex ties.

# The difference of 18 percentage points is comparable but not equal to the previous calculation.
# Note that I have simulated only one network here. To generalize, we should simulate many networks
# and average the effect across them.

# See lecture slides for a more elaborate application of this approach to examining how predictors 
# shape network structures...

# Further reading:
# - Robins, G., Pattison, P., & Woolcock, J. (2005). Small and other worlds: Global network structures from local processes. American Journal of Sociology, 110(4), 894-936.
# - Snijders, T. A., & Steglich, C. E. (2015). Representing micro–macro linkages by actor-based dynamic network models. Sociological methods & research, 44(2), 222-271. 
# - Duxbury, S. (forthcoming): A General Framework for Micro-Macro Analysis in Social Networks Research.

# eof
